In [2]:
! sudo pip -q install pandas


/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/_vendor/requests/packages/urllib3/util/ssl_.py:318: SNIMissingWarning: An HTTPS request has been made, but the SNI (Subject Name Indication) extension to TLS is not available on this platform. This may cause the server to present an incorrect TLS certificate, which can cause validation failures. You can upgrade to a newer version of Python to solve this. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#snimissingwarning.
  SNIMissingWarning
/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/_vendor/requests/packages/urllib3/util/ssl_.py:122: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. You can upgrade to a newer version of Python to solve this. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/_vendor/requests/packages/urllib3/util/ssl_.py:122: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. You can upgrade to a newer version of Python to solve this. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning

In [4]:
! cd ~/pynb/fb15k-akbc
! wget -q https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.zip
! unzip FB15K-237.zip


Archive:  FB15K-237.zip
  inflating: Release/README.txt      
  inflating: Release/test.txt        
  inflating: Release/text_cvsc.txt   
  inflating: Release/text_emnlp.txt  
  inflating: Release/train.txt       
  inflating: Release/valid.txt       

In [3]:
import pandas as pd

In [5]:
BASE_DIR = './Release/'
TRAIN_FILE = BASE_DIR + 'train.txt'
TEST_FILE = BASE_DIR + 'test.txt'
VALID_FILE = BASE_DIR + 'valid.txt'
TEXT_CVSC_FILE = BASE_DIR + 'text_cvsc.txt'
TRAIN_CSV_FILE = 'fb15k_train.csv'
VALID_CSV_FILE = 'fb15k_valid.csv'
TEST_CSV_FILE = 'fb15k_test.csv'
CVSC_ENTITIES_CSV_FILE = 'fb15k_cvsc_entities.csv'
CVSC_TRAIN_CSV_FILE = 'fb15k_cvsc_train.csv'
CVSC_PAIRS_CSV_FILE = 'fb15k_cvsc_pairs.csv'
CVSC_RELATIONS_CSV_FILE = 'fb15k_cvsc_relations.csv'
ENTITY_PAIRS = {}
RELATIONS = {}

In [6]:
def index(val, idx):
    if val not in idx:
        idx[val] = len(idx)
    return idx[val]

In [7]:
def add_id_columns(df):
    df['pair'] = df['subj'] + ':' + df['obj']
    df['pid'] = df['pair'].apply(lambda x: index(x, ENTITY_PAIRS))
    df['rid'] = df['rel'].apply(lambda x: index(x, RELATIONS))

Prepare KB triples train/validate/test sets


In [8]:
train_kb_triples = pd.read_csv(TRAIN_FILE, sep='\t', names=['subj', 'rel', 'obj'])
add_id_columns(train_kb_triples)
print 'Train KB triples:', len(train_kb_triples)
train_kb_triples.to_csv(TRAIN_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid'])
print 'Saved to', TRAIN_CSV_FILE


Train KB triples: 272115
Saved to fb15k_train.csv

In [16]:
valid_kb_triples = pd.read_csv(VALID_FILE, sep='\t', names=['subj', 'rel', 'obj'])
add_id_columns(valid_kb_triples)
print 'Validation KB triples:', len(valid_kb_triples)
valid_kb_triples.to_csv(VALID_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid'])
print 'Saved to', VALID_CSV_FILE


Validation KB triples: 17535
Saved to fb15k_valid.csv

In [10]:
test_kb_triples = pd.read_csv(TEST_FILE, sep='\t', names=['subj', 'rel', 'obj'])
add_id_columns(test_kb_triples)
print 'Test KB triples:', len(test_kb_triples)
test_kb_triples.to_csv(TEST_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid'])
print 'Saved to', TEST_CSV_FILE


Test KB triples: 20466
Saved to fb15k_test.csv

Prepare CVSC datasets


In [11]:
cvsc_text_triples = pd.read_csv(TEXT_CVSC_FILE, sep='\t', names=['subj', 'rel', 'obj', 'occ'])
add_id_columns(cvsc_text_triples)
print 'Text triples (CVSC):', len(cvsc_text_triples)
cvsc_train_triples = pd.concat([train_kb_triples, cvsc_text_triples], join="outer")
print 'Training triples (CVSC):', len(cvsc_train_triples)
cvsc_train_triples.to_csv(CVSC_TRAIN_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid', 'occ'])
print 'Saved to', CVSC_TRAIN_CSV_FILE


Text triples (CVSC): 6600401
Training triples (CVSC): 6872516
Saved to fb15k_cvsc_train.csv

In [12]:
cvsc_entities = cvsc_text_triples['subj'].combine_first(cvsc_text_triples['obj']).drop_duplicates()
cvsc_entities.name = "entity"
print 'Entities:', len(cvsc_entities)
cvsc_entities.to_csv(CVSC_ENTITIES_CSV_FILE, sep='\t', header=True)
print 'Saved to', CVSC_ENTITIES_CSV_FILE


Entities: 14282
Saved to fb15k_cvsc_entities.csv

In [13]:
cvsc_pairs = cvsc_train_triples[['subj', 'obj', 'pid']].drop_duplicates()
print 'Entity pairs (CVSC):', len(cvsc_pairs)
cvsc_pairs.to_csv(CVSC_PAIRS_CSV_FILE, sep='\t', header=True, columns=['subj', 'obj', 'pid'])
print 'Saved to', CVSC_PAIRS_CSV_FILE


Entity pairs (CVSC): 2966835
Saved to fb15k_cvsc_pairs.csv

In [14]:
cvsc_relations = cvsc_train_triples[['rel', 'rid']].drop_duplicates()
print 'Relations (CVSC):', len(cvsc_relations)
cvsc_relations.to_csv(CVSC_RELATIONS_CSV_FILE, sep='\t', header=True, columns=['rel', 'rid'])
print 'Saved to', CVSC_RELATIONS_CSV_FILE


Relations (CVSC): 26154
Saved to fb15k_cvsc_relations.csv

In [15]:
print 'Pairs:', cvsc_train_triples['pid'].max() + 1
print 'Relations:', cvsc_train_triples['rid'].max() + 1


Pairs: 2995738
Relations: 26154

In [ ]: